---
date: "`r Sys.Date()`"
output: html_document
---

```{r setup, include=FALSE}

knitr::opts_chunk$set(echo = FALSE, comment = "", message = FALSE, warning = FALSE, fig.height = 8, fig.width = 10)

load("glm_res.Rdata")

```

# Cerebral oxygenation in cerebral malaria: logistic regression analysis {.tabset}

```{r data}

library(tidyverse)

# read in data
df_hbox <- read.csv("Data/98 first visit records with cerebral oxygenation stats_trimmed.csv") %>% as_tibble()

# get rid of "Other"
df_hbox <- df_hbox %>% 
  filter(Status != "Other") %>% 
  mutate(Status = factor(Status, levels = c("HV", "UM", "CM")))

# identify duplicated rows (TM0003, TM2001)
df_hbox <- df_hbox %>% filter(duplicated(Subject.ID..NIAID.) == FALSE)

# Convert age to numeric
df_hbox <- df_hbox %>% 
  mutate(Age = gsub(" Years, ", "_", Age)) %>% 
  mutate(Age = gsub(" Months ", "", Age)) %>% 
  separate(Age, into = c("Years", "Months"), sep = "_") %>% 
  mutate(Years = as.numeric(Years)) %>% 
  mutate(Months = as.numeric(Months)/12) %>% 
  mutate(Age = Years + Months) %>% 
  dplyr::select(-c(Years, Months))

# Convert sex to factor
df_hbox <- df_hbox %>% mutate(Sex = as.factor(Sex))

# Trim for variables of interest (VOI)
df_hbox_trim <- df_hbox %>% 
  dplyr::select(c(Subject.ID..NIAID., Status, Glucose, Hematocrit, Lactate, Temperature,
                  Arginine.umol.L, Haptoglobin..mg.dl., Hemoglobin..uM., Whole.Blood.Nitrite, 
                  cerebral.hbdeox, cerebral.hbdeox.alpha, cerebral.hbox, cerebral.hbox.alpha,
                  cerebral.sat, cerebral.sat.alpha, cerebral.thc, cerebral.thc.alpha, cerebral.thc.norm.std)) %>% 
  
  dplyr::rename("subject_id" = "Subject.ID..NIAID.", "Arginine (umol/L)" = "Arginine.umol.L", "Haptoglobin (mg/dL)" = "Haptoglobin..mg.dl.", 
                "Hemoglobin (uM)" = "Hemoglobin..uM.", "WB_Nitrite" = "Whole.Blood.Nitrite", "DeoxyHb" = "cerebral.hbdeox", 
                "DeoxyHb_alpha" = "cerebral.hbdeox.alpha", "OxyHb" = "cerebral.hbox", "OxyHb_alpha" = "cerebral.hbox.alpha", 
                "O2sat" = "cerebral.sat", "O2sat_alpha" = "cerebral.sat.alpha", "THC" = "cerebral.thc", 
                "THC_alpha" = "cerebral.thc.alpha", "THC_sd" = "cerebral.thc.norm.std") 

# impute missing data with median by group
f_impute <- function(x, na.rm = TRUE) (replace(x, is.na(x), median(x, na.rm = na.rm)))

df_hbox_impute <- df_hbox_trim %>% 
  group_by(Status) %>% 
  mutate_at(3:ncol(.), f_impute) %>% 
  ungroup()

# pivot long
df_hbox_long <- df_hbox_trim %>% pivot_longer(cols = 3:ncol(.), names_to = "measurement", values_to = "value")

```

## Exploratory analysis

```{r voi}

# filter to just UM and CM
df_hbox_log <- df_hbox_impute %>% filter(Status != "HV") %>% select(-subject_id)

# select only VOI
df_hbox_log_voi <- df_hbox_log %>% select(1, 10:18)

# convert CM vs UM to 1 and 0
df_hbox_voi_01 <- df_hbox_log_voi %>% mutate(Status = ifelse(Status == "CM", 1, 0) %>% as.factor())

```

```{r voi_plot}

df_hbox_log_voi %>% pivot_longer(2:10, names_to = "variable", values_to = "value") %>% 
  
  ggplot(aes(x = Status, y = value)) +
  geom_violin(aes(color = Status)) +
  geom_jitter(position = position_jitter(0.2), shape = 1) +
  stat_summary(fun = "median", geom = "crossbar", aes(color = Status), size = 0.2, width = 0.5) +
  facet_wrap(~variable, scales = "free_y") +
  
  theme_bw() +
  theme(legend.position = "none")  +
  
  theme(strip.text = element_text(size = 15),
        axis.title = element_text(size = 15),
        axis.title.y = element_blank(),
        axis.text = element_text(size = 15))

```

```{r histogram, fig.height = 6}

# histogram

df_hbox_log_voi %>% pivot_longer(2:10, names_to = "variable", values_to = "value") %>% 
  
  ggplot(aes(x = value)) +
  geom_histogram(bins = 20) +
  facet_grid(Status ~ variable, scales = "free") +
  
  theme(strip.text.y = element_text(size = 15),
        strip.text.x = element_text(size = 11),
        axis.title.x = element_blank(),
        axis.title.y = element_text(size = 15),
        axis.text = element_text(size = 12),
        axis.text.x = element_text(angle = 45, hjust = 0.9))

```

## PCA & variable correlations
```{r pca}

library(factoextra)

# convert tibble to df
df_hbox_voi_df <- df_hbox_log_voi %>% dplyr::select(-Status) %>% as.data.frame() 
rownames(df_hbox_voi_df) <- df_hbox_impute %>% dplyr::filter(Status != "HV") %>% .$subject_id

# compute PCA
my_pca <- prcomp(df_hbox_voi_df, scale = TRUE)

# define groups
groups <- as.factor(df_hbox_log_voi$Status)

# Eigenvalues
pca_eig_val <- get_eigenvalue(my_pca)

# two data frames for plotting
df_vars <- my_pca$rotation %>% as_tibble() %>% mutate(variable = rownames(my_pca$rotation))
df_pnts <- my_pca$x %>% as_tibble() %>% mutate(group = groups)

# plot

library(ggpubr)
library(ggrepel)

  # plot patients
  ggplot() +
  geom_point(data = df_pnts, mapping = aes(x = PC1, y = PC2, color = group), 
             size = 3) +
  stat_conf_ellipse(data = df_pnts, mapping = aes(x = PC1, y = PC2, color = group, fill = group),
                    level = 0.95, npoint = 100, bary = TRUE, alpha = 0.1, geom = "polygon") +
  stat_mean(data = df_pnts, mapping = aes(x = PC1, y = PC2, color = group, shape = group),
            na.rm = FALSE) +
  
  # plot variables
  geom_point(data = df_vars, aes(x = PC1*10, y = PC2*10)) +
  geom_text_repel(data = df_vars, aes(x = PC1*10, y = PC2*10, label = variable, size = 10), show.legend = FALSE) +
  geom_segment(data = df_vars, aes(x = 0, y = 0, xend = PC1*10, yend = PC2*10), arrow = arrow()) +

  # draw lines
  geom_vline(xintercept = 0, lty = 2, color = "black") +
  geom_hline(yintercept = 0, lty = 2, color = "black") +
  
  # design
  xlab(paste0("PC1: ", round(pca_eig_val$variance.percent[1], 2), "% of variance")) +
  ylab(paste0("PC1: ", round(pca_eig_val$variance.percent[2], 2), "% of variance")) +

  ggtitle("PCA") +
  theme_bw() 


```

```{r correlations, fig.show = "hold", out.width = "50%", fig.height = 12, fig.width = 15}

# calculate spearman correlations in each group

f_calc_cor <- function(df, status) {
  
  df %>%
    filter(Status == status) %>% 
    dplyr::select(-Status) %>% 
    cor(method = "spearman")
  
}

m_cor_hv <- df_hbox_impute %>% dplyr::select(c(2, 11:19)) %>% f_calc_cor(status = "HV")

m_cor_um <- df_hbox_impute %>% dplyr::select(c(2, 11:19)) %>% f_calc_cor(status = "UM")

m_cor_cm <- df_hbox_impute %>% dplyr::select(c(2, 11:19)) %>% f_calc_cor(status = "CM")

# correlation matrix

  library(reshape)
  library(ggpubr)
  
  # write plot function
  f_plot_cor <- function(mat) {
    
    melt(mat) %>% 
      
      ggplot(aes(x = X1, y = X2, fill = value, label = round(value, 2))) + 
      geom_tile() +
      scale_fill_gradient2(low = "#4575b4", high = "#d73027") +
      geom_text(size = 8) +
      
      xlab("") +
      ylab("") +
      theme(axis.text.x = element_text(angle = 45, hjust = 0.9),
            axis.text = element_text(size = 35),
            legend.text = element_text(size = 40),
            legend.title = element_text(size = 50),
            legend.key.size = unit(2, 'cm'),
            plot.title = element_text(size = 30)
            )
    
  }
  
  # extract legend
  p_leg <- m_cor_hv %>% f_plot_cor() %>% get_legend()
  
  # create plots
  m_cor_hv %>% f_plot_cor() + ggtitle("HV variable correlations") + theme(legend.position = "none")
  
  m_cor_um %>% f_plot_cor() + ggtitle("UM variable correlations") + theme(legend.position = "none")
  
  m_cor_cm %>% f_plot_cor() + ggtitle("CM variable correlations") + theme(legend.position = "none")
  
  as_ggplot(p_leg)

```

## Logistic regression {.tabset}

### Explore data
```{r log_plot}

df_hbox_voi_01 %>% 
  pivot_longer(2:10, names_to = "variable", values_to = "value") %>% 
  mutate(Status = as.numeric(as.character(Status))) %>% 
  
  ggplot(aes(x = value, y = Status)) +
  geom_point() +
  geom_smooth(method = "glm", method.args = list(family = "binomial")) +
  facet_wrap(~variable, scales = "free_x") +
  
  ylab("Status: 1 = CM, 0 = UM") +
  xlab("") +
  theme_bw()

```

### Model 1

#### Build

- logistic regression model
- variables: DeoxyHb, DeoxyHb_alpha, OxyHb, OxyHb_alpha, O2sat, O2sat_alpha, THC, THC_alpha, THC_sd
- bootstrap x1000
- BoxCox transformation

```{r split_boot}

library(tidymodels)
library(rsample)

# resample
set.seed(234)
hbox_boot <- bootstraps(df_hbox_voi_01, times = 1000)

```

```{r model_spec}

# recipe
hbox_rec <- recipe(Status ~ ., data = df_hbox_voi_01) %>% 
  step_BoxCox(all_predictors()) %>% 
  step_nzv(all_predictors())

# workflow
hbox_wf <- workflow() %>% 
  add_recipe(hbox_rec)

# logistic regression model spec
glm_spec <- logistic_reg() %>% 
  set_engine("glm")

```

```{r train_model, eval = FALSE}

glm_res <- hbox_wf %>% 
  add_model(glm_spec) %>%
  fit_resamples(
    resamples = hbox_boot,
    metrics = metric_set(roc_auc, accuracy, sensitivity, specificity),
    control = tune::control_resamples(save_pred = TRUE)
  )

```

```{r best}

# select best model to be used for evaluation
glm_best <- glm_res %>% select_best("roc_auc")

# finalize model
hbox_final <- hbox_wf %>% 
  add_model(glm_spec) %>% 
  finalize_workflow(glm_best) %>%
  fit(df_hbox_voi_01) %>%
  pull_workflow_fit()

```

#### Evaluate

```{r eval_training}
 
# collect metrics
collect_metrics(glm_res) %>% 
  dplyr::select(.metric, mean, n, std_err) %>% 
  as.data.frame()

# confusion matrix on training data
glm_res %>% 
  conf_mat_resampled() %>% 
  mutate(Prediction = ifelse(Prediction == 1, "CM", "UM")) %>% 
  mutate(Truth = ifelse(Truth == 1, "CM", "UM")) %>% 
  
  # plot
  ggplot(aes(x = Truth, y = Prediction, fill = Freq, label = Freq)) +
  geom_tile() +
  geom_text(size = 6) +
  scale_fill_gradient2(low = "#4575b4", high = "#d73027") +
  
  ggtitle("Confusion matrix") +
  theme_bw()

```

```{r roc, eval = FALSE}

glm_res %>% 
  collect_predictions() %>% 
  group_by(id) %>% 
  roc_curve(Status, .pred_0) %>% 
  
  ggplot(aes(1 - specificity, sensitivity, color = id)) +
  geom_abline(lty = 2, color = "gray60", size = 1.5) +
  geom_path(show.legend = FALSE, alpha = 0.6, size = 1.2) +
  coord_equal() +
  ggtitle("ROC") +
  theme_bw()

```

Coefficients
```{r coefficients}

hbox_final %>% 
  tidy() %>% 
  as.data.frame()

```

Odds ratios
```{r OR}

# OR
hbox_final %>% 
  tidy(exponentiate = TRUE) %>% 
  arrange(desc(estimate)) %>% 
  as.data.frame()

```

### Model 2

#### Build

- logistic regression model
- variables: THC_alpha, THC_sd
- bootstrap x1000
- BoxCox transformation

```{r split_boot2}

library(tidymodels)
library(rsample)

# select only significant variables
df_hbox_model2 <- df_hbox_voi_01 %>% dplyr::select(c(Status, THC_alpha, THC_sd))

# resample
set.seed(789)
hbox_boot2 <- bootstraps(df_hbox_model2, times = 1000)

```

```{r model_spec2}

# recipe
hbox_rec2 <- recipe(Status ~ ., data = df_hbox_model2) %>% 
  step_BoxCox(all_predictors()) %>% 
  step_nzv(all_predictors())

# workflow
hbox_wf2 <- workflow() %>% 
  add_recipe(hbox_rec2)

# logistic regression model spec
glm_spec2 <- logistic_reg() %>% 
  set_engine("glm")

```

```{r train_model2, eval = FALSE}

glm_res2 <- hbox_wf2 %>% 
  add_model(glm_spec2) %>%
  fit_resamples(
    resamples = hbox_boot2,
    metrics = metric_set(roc_auc, accuracy, sensitivity, specificity),
    control = tune::control_resamples(save_pred = TRUE, verbose = TRUE)
  )

save(glm_res, glm_res2, file = "glm_res.Rdata")

```   

```{r best2}

# select best model to be used for evaluation
glm_best2 <- glm_res2 %>% select_best("roc_auc")

# finalize model
hbox_final2 <- hbox_wf2 %>% 
  add_model(glm_spec2) %>% 
  finalize_workflow(glm_best2) %>%
  fit(df_hbox_model2) %>%
  pull_workflow_fit()

```

#### Evaluate

```{r eval_training2}

# collect metrics
collect_metrics(glm_res2) %>% 
  dplyr::select(.metric, mean, n, std_err) %>% 
  as.data.frame()

# confusion matrix on training data
glm_res2 %>% 
  conf_mat_resampled() %>% 
  mutate(Prediction = ifelse(Prediction == 1, "CM", "UM")) %>% 
  mutate(Truth = ifelse(Truth == 1, "CM", "UM")) %>% 
  
  # plot
  ggplot(aes(x = Truth, y = Prediction, fill = Freq, label = Freq)) +
  geom_tile() +
  geom_text(size = 6) +
  scale_fill_gradient2(low = "#4575b4", high = "#d73027") +
  
  ggtitle("Confusion matrix") +
  theme_bw()

```

```{r roc2, eval = FALSE}

glm_res2 %>% 
  collect_predictions() %>% 
  group_by(id) %>% 
  roc_curve(Status, .pred_0) %>% 
  
  ggplot(aes(1 - specificity, sensitivity, color = id)) +
  geom_abline(lty = 2, color = "gray60", size = 1.5) +
  geom_path(show.legend = FALSE, alpha = 0.6, size = 1.2) +
  coord_equal() +
  ggtitle("ROC") +
  theme_bw()

```

Coefficients
```{r coefficients2}

hbox_final2 %>% 
  tidy() %>% 
  as.data.frame()

```

Odds ratios
```{r OR2}

hbox_final2 %>% 
  tidy(exponentiate = TRUE) %>% 
  arrange(desc(estimate)) %>% 
  as.data.frame()

```

## THC_alpha & THC_sd

```{r violin_plot}

df_hbox_log_voi %>% 
  pivot_longer(2:10, names_to = "variable", values_to = "value") %>% 
  filter(variable %in% c("THC_alpha", "THC_sd")) %>% 
  
  ggplot(aes(x = Status, y = value)) +
  geom_violin(aes(color = Status)) +
  geom_jitter(position = position_jitter(0.2), shape = 1) +
  stat_summary(fun = "median", geom = "crossbar", aes(color = Status), size = 0.2, width = 0.5) +
  facet_wrap(~variable, scales = "free_y") +
  
  ggtitle("Violin plot") +
  theme_bw() +
  theme(legend.position = "none")  +
  
  theme(strip.text = element_text(size = 15),
        axis.title = element_text(size = 15),
        axis.title.y = element_blank(),
        axis.text = element_text(size = 15))

```

```{r box_plot}

# calculate quantiles of THC_alpha scores
Q1 <- quantile(df_hbox_log_voi$THC_alpha, 0.25)
Q2 <- quantile(df_hbox_log_voi$THC_alpha, 0.50)
Q3 <- quantile(df_hbox_log_voi$THC_alpha, 0.75)

df_hbox_log_voi %>% 
  dplyr::select(c("Status", "THC_alpha", "THC_sd")) %>% 
  
  # assign quantiles to THC_alpha scores
  mutate(THCa_quartile = case_when(
    
    THC_alpha < Q1 ~ 1,
    Q1 < THC_alpha & THC_alpha < Q2 ~ 2,
    Q2 < THC_alpha & THC_alpha <= Q3 ~ 3,
    Q3 < THC_alpha ~ 4

  ) %>% as.factor()) %>%

  # plot
  ggplot(aes(x = Status, y = THC_sd)) +
  geom_boxplot(show.legend = FALSE, outlier.shape = NA) +
  geom_point(aes(color = THCa_quartile), position = position_jitter(0.2), size = 3) +
  stat_summary(aes(color = THCa_quartile), fun = "median", geom = "crossbar", size = 0.3, width = 0.5) +
  scale_color_manual(values = c("#F8766D", "#7CAE00", "#00BFC4", "#C77CFF", "black", "black")) +

  ggtitle("Relationship between THC_sd and THC_alpha") +
  theme_bw() 

```

```{r scatter_plot}


cm_model <- lm(THC_sd ~ THC_alpha, data = df_hbox_log_voi %>% filter(Status == "CM"))
um_model <- lm(THC_sd ~ THC_alpha, data = df_hbox_log_voi %>% filter(Status == "UM"))

df_hbox_log_voi %>%
  
  ggplot(aes(x = THC_alpha, y = THC_sd, color = Status, fill = Status)) +
  geom_point(size = 3, alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  
  geom_text(label = paste0("y = ", 
                           round(cm_model$coefficients[[1]], 2), 
                           " + ", 
                           round(cm_model$coefficients[[2]], 2), 
                           "x ; R^2 = ", 
                           round(summary(cm_model)$adj.r.squared, 2)),
            x = 1.3, y = 0.55,
            color = "#00BFC4",
            size = 6) +
  
    geom_text(label = paste0("y = ", 
                           round(um_model$coefficients[[1]], 2), 
                           " + ", 
                           round(um_model$coefficients[[2]], 2), 
                           "x ; R^2 = ", 
                           round(summary(um_model)$adj.r.squared, 2)),
            x = 1.3, y = 0.6,
            color = "#F8766D",
            size = 6) +

  
  ggtitle("Scatter plot") +
  theme_bw()

```


